In [2]:
import pandas as pd
import numpy as np
from numpy import exp, sqrt, dot
from scipy.spatial.distance import cdist
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import scale
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from hyperopt import STATUS_OK, hp, fmin, tpe, Trials, space_eval
In [3]:
def load_data():
    full_data = pd.read_csv("Data/X.csv")
    train_y = pd.read_csv("Data/y_train.csv")
    # Rename columns to something more interpretable
    columns = (["reflectance_" + str(i) for i in range(7)]
               + ["solar_" + str(i) for i in range(5)] + ["id"])
    full_data.columns = columns
    # Move the ID column to the beginning
    id_column = full_data["id"]
    full_data.drop(labels=["id"], axis=1, inplace=True)
    full_data.insert(0, "id", id_column)
    # Add the target value column to the training part of full_data
    split = 98000
    y_id_dict = train_y.set_index("id")["y"].to_dict()
    full_data.loc[:(split - 1), "y"] = full_data.loc[:(split - 1), "id"].map(y_id_dict)
    # Split into training and testing data
    train, test = full_data[:split], full_data[split:]
    return (train, test)
train, test = load_data()
In [4]:
random_seed = 8
# Set folds for out-of-fold prediction
n_folds = 5
In [5]:
cols_excl = ["id", "y"]
cols_orig = [c for c in train.columns if c not in cols_excl]
# Standardising the data can make training faster and reduce
# the chances of getting stuck in local optima
train[cols_orig] = scale(train[cols_orig])
test[cols_orig] = scale(test[cols_orig])
Instead of fitting a model to the instances themselves, the idea of distribution regression is to perform the regression on the underlying probability distributions the instances come from. It is based on the assumption that the data is $\{(x_i, y_i)\}_{i=1}^{n}$, with $x_i$ a probability distribution and $y_i \in \mathbb{R}$ the associated target value.
However, $x_i$ is not observed: for each bag $i$, the $100$ instances $x_{i,l}$, $l=1,...,100$, are samples from the distribution $x_i$. Our dataset is thus $\{(\{x_{i,l}\}_{l=1}^{100}, y_i)\}_{i=1}^{n}$ and we want to find a mapping $\hat{f}$ that will best predict unseen bags.
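As a quick sanity check of this bag structure (a hypothetical snippet run on the data loaded above), every id should appear exactly 100 times:
bag_sizes = train.groupby("id").size()
print(bag_sizes.value_counts())  # expect a single row: 100 instances per bag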
The mapping $\hat{f}$ on $\{(\{x_{i,l}\}_{l=1}^{100}, y_i)\}_{i=1}^{n}$ will try to learn the relationship between the true distributions $\{x_i\}_{i=1}^{n}$ and the target values $\{y_i\}_{i=1}^{n}$. To achieve that, the information carried by the 100 instances of each bag has to be summarised while losing as little information as possible. The aggregated approach that simply computes the mean of the features over each bag is one such summary, yet plenty of information is lost that way. A better way to represent each bag is via its kernel mean embedding: $$\mu_{\hat{x}_i} = \frac{1}{100}\sum_{l=1}^{100} k(\cdot, x_{i,l})$$
Each bag is represented as a linear combination of kernel functions, and with the right choice of kernel $k$, the information lost can be negligible.
We now want to find the $\hat{f}$ that minimises the following regularised least-squares problem: $$ \underset{f}{\arg\min} \sum_{i=1}^{n} (f(\mu_{\hat{x}_i}) - y_i)^2 + \lambda \Vert f \Vert^2$$
with $\lambda>0$ the L2 regularisation parameter.
In kernel ridge regression, $f$ is interpreted as a linear combination of feature space mappings $\phi$ of the data points $\mu_{\hat{x}_i}$: $$ f = \sum_{i=1}^{n} \alpha_i \phi(\mu_{\hat{x}_i} ) $$
The problem thus becomes: $$ \underset{\alpha}{\arg\min} \left( \Vert y - K\alpha \Vert^2 + \lambda \alpha^T K \alpha \right)$$ with $y = (y_1, \dots, y_n)^T$ the vector of target values and $K$ the Gram matrix of the embedded bags, $K(i,j) = k'(\mu_{\hat{x}_i}, \mu_{\hat{x}_j})$ for a second-level kernel $k'$.
Differentiating with respect to $\alpha$ and setting the gradient to zero, $2K(K\alpha - y) + 2\lambda K \alpha = 0$, gives: $$ \alpha^{*} = (K + \lambda I_n)^{-1}y $$
For the sake of simplicity, and because the results proved reasonably good, we take $k'$ to be the linear kernel, so that $K(i,j)$ is the inner product of the two mean embeddings: $$ K(i,j) = \langle \mu_{\hat{x}_i}, \mu_{\hat{x}_j} \rangle = \frac{1}{100^2} \sum_{l,k=1}^{100} k(x_{i,l} , x_{j,k})$$
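Concretely, $K(i,j)$ is just the average of all pairwise base-kernel evaluations between the two bags. A minimal sketch with two toy bags and an RBF base kernel, using the imports at the top of the notebook (names and sizes are illustrative):
rng = np.random.RandomState(0)
bag_i, bag_j = rng.randn(100, 12), rng.randn(100, 12)  # two toy bags of 100 instances

sigma = 1.0
pairwise = exp(-cdist(bag_i, bag_j) ** 2 / (2 * sigma ** 2))  # base RBF kernel k
K_ij = pairwise.mean()  # = (1/100^2) * sum over all pairs, as in the formula above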
In [6]:
class Kernel(object):
    """ Kernel class from Zoltan Szabo,
        giving the kernel mean embedding.
    """

    def __init__(self, par=None):
        """ Initialization.

        Parameters
        ----------
        par : dictionary, optional
              Name of the kernel and its parameters (default is
              {"name": "RBF", "sigma": 1}). The name of the kernel comes
              from "RBF", "exponential", "Cauchy", "student", "Matern3p2",
              "Matern5p2", "polynomial", "ratquadr" (rational quadratic),
              "invmquadr" (inverse multiquadric).
        """
        if par is None:
            par = {"name": "RBF", "sigma": 1}
        name = par["name"]
        self.name = name
        # Other attributes depend on the kernel chosen:
        if name in ("RBF", "exponential", "Cauchy"):
            self.sigma = par["sigma"]
        elif name == "student":
            self.d = par["d"]
        elif name in ("Matern3p2", "Matern5p2"):
            self.l = par["l"]
        elif name == "polynomial":
            self.c = par["c"]
            self.exponent = par["exponent"]
        elif name in ("ratquadr", "invmquadr"):
            self.c = par["c"]
        else:
            raise Exception("kernel=?")

    def gram_matrix(self, y1, y2):
        """ Compute the Gram matrix = [k(y1[i,:], y2[j,:])]; i, j: running.

        Parameters
        ----------
        y1 : (number of samples1, dimension)-ndarray
             One row of y1 corresponds to one sample.
        y2 : (number of samples2, dimension)-ndarray
             One row of y2 corresponds to one sample.

        Returns
        -------
        g : ndarray.
            Gram matrix of y1 and y2.
        """
        if self.name == "RBF":
            sigma = self.sigma
            g = cdist(y1, y2)
            g = exp(-g ** 2 / (2 * sigma ** 2))
        elif self.name == "exponential":
            sigma = self.sigma
            g = cdist(y1, y2)
            g = exp(-g / (2 * sigma ** 2))
        elif self.name == "Cauchy":
            sigma = self.sigma
            g = cdist(y1, y2)
            g = 1 / (1 + g ** 2 / sigma ** 2)
        elif self.name == "student":
            d = self.d
            g = cdist(y1, y2)
            g = 1 / (1 + g ** d)
        elif self.name == "Matern3p2":
            l = self.l
            g = cdist(y1, y2)
            g = (1 + sqrt(3) * g / l) * exp(-sqrt(3) * g / l)
        elif self.name == "Matern5p2":
            l = self.l
            g = cdist(y1, y2)
            g = (1 + sqrt(5) * g / l + 5 * g ** 2 / (3 * l ** 2)) * \
                exp(-sqrt(5) * g / l)
        elif self.name == "polynomial":
            c = self.c
            exponent = self.exponent
            g = (dot(y1, y2.T) + c) ** exponent
        elif self.name == "ratquadr":
            c = self.c
            g = cdist(y1, y2) ** 2
            g = 1 - g / (g + c)
        elif self.name == "invmquadr":
            c = self.c
            g = cdist(y1, y2)
            g = 1 / sqrt(g ** 2 + c ** 2)
        else:
            raise Exception("kernel=?")
        return g
In [7]:
# Compute the linear kernel product of the mean embeddings
# of two bags X1 and X2, denoted K(i, j) above
def mean_embedding(X1, X2, par):
    k = Kernel(par)
    gram_mat = k.gram_matrix(X1, X2)
    # Mean over all pairwise kernel values; with bags of 100
    # instances this is the double sum divided by 100^2
    mu_X1_X2 = gram_mat.mean()
    return mu_X1_X2

# Return a symmetrised matrix from its lower triangle
def symmetrise(A):
    return A + A.T - np.diag(A.diagonal())

# Compute the Gram matrix K given the kernel name and
# the smoothing parameter theta
def compute_gram(df, kernel, theta):
    # Map theta to the parameter name expected by Kernel
    if kernel in ("RBF", "exponential", "Cauchy"):
        par = {"name": kernel, "sigma": theta}
    elif kernel in ("Matern3p2", "Matern5p2"):
        par = {"name": kernel, "l": theta}
    elif kernel in ("ratquadr", "invmquadr"):
        par = {"name": kernel, "c": theta}
    else:
        raise Exception("kernel=?")
    nb_bag = df["id"].nunique()
    K_matrix = np.zeros((nb_bag, nb_bag))
    print("Computing {0} Gram matrix for theta={1}:".format(kernel, theta))
    for i in range(nb_bag):
        if i % 50 == 0:
            print("Bag number: {0}".format(i))
        # K is symmetric, so only the lower triangle is computed
        for j in range(i + 1):
            # Mean embedding of bags i+1 and j+1 (bag ids start at 1)
            X1 = df.loc[df["id"] == (i + 1), cols_orig].values
            X2 = df.loc[df["id"] == (j + 1), cols_orig].values
            K_matrix[i, j] = mean_embedding(X1, X2, par)
    return symmetrise(K_matrix)
#K_cauchy = compute_gram(train, "Cauchy", 2**4)
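The full Gram matrices (train and test bags together) are expensive to compute, so in this notebook they were computed once and written to disk; the later cells simply load them back. A minimal sketch of how the saved files could be produced, assuming the file names encode $\theta$ (e.g. Cauchy_16 for $\theta = 2^4$) and that bag ids run from 1 to the total number of bags, as compute_gram expects:
# Hypothetical one-off computation of the Gram matrices loaded below
full_bags = pd.concat([train, test])
for kernel, theta, path in [("Cauchy", 2**4, "kernels_me/Cauchy_16.csv"),
                            ("Matern5p2", 2**6, "kernels_me/Matern_52_64.csv"),
                            ("ratquadr", 2**9, "kernels_me/rquadr_512.csv")]:
    K = compute_gram(full_bags, kernel, theta)
    np.savetxt(path, K, delimiter=",")  # headerless CSV, as read by load_gram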
In [8]:
# Kernel ridge regression working directly on a precomputed Gram matrix
class RidgeRegression(object):
    def __init__(self, l2_reg):
        self.l2_reg = l2_reg

    def fit(self, G, y):
        # Number of training bags
        n_train = G.shape[0]
        # The regularisation is scaled by n_train, i.e. we minimise
        # the mean rather than the sum of the squared errors
        self.ridge_mat = G + (self.l2_reg * n_train) * np.identity(n_train)
        # y has shape (1, n_train)
        self.y_train = y

    def predict(self, G_test):
        # y (K + lambda*n*I)^{-1} K_test, flattened to a 1-D array
        y_test_hat = self.y_train.dot(np.linalg.solve(self.ridge_mat, G_test))
        return y_test_hat.ravel()
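A small toy check (hypothetical data) that predict reproduces the closed-form solution above, with $\lambda$ scaled by the number of training points inside fit:
n_tr, n_te, d = 5, 3, 2
rng = np.random.RandomState(0)
X = rng.randn(n_tr + n_te, d)
G = X.dot(X.T)  # a valid Gram matrix on train + test points
y = rng.randn(1, n_tr)

krr = RidgeRegression(l2_reg=0.1)
krr.fit(G[:n_tr, :n_tr], y)
alpha = np.linalg.solve(G[:n_tr, :n_tr] + 0.1 * n_tr * np.identity(n_tr), y.ravel())
assert np.allclose(krr.predict(G[:n_tr, n_tr:]), alpha.dot(G[:n_tr, n_tr:]))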
A kernel is characterised by a parameter we will call $\theta$, and the ridge regression depends on the L2 regularisation parameter $\lambda$. Through cross-validation, we selected the kernels giving the most stable validation loss. With their associated parameters (the values used in the cells below), they are: the Cauchy kernel ($\theta = 2^4$, $\lambda = 2^{-23}$), the Matérn 5/2 kernel ($\theta = 2^6$, $\lambda = 2^{-31}$) and the rational quadratic kernel ($\theta = 2^9$, $\lambda = 2^{-26}$).
We then map the features into three spaces that describe the data in different ways. Each kernel ridge regression gives a prediction of the labels, and combining these predictions might take us a step closer to the true model $f$. The ensembling technique we used is out-of-fold stacking.
In the first stage, out-of-fold prediction is used so that no first-layer regressor predicts on data it was trained on. For each regressor, we split the training data into $N$ folds ($N=5$ in our model), train the model on $N-1$ folds and predict the target values of the remaining fold, iterating until every fold has been predicted once. The new testing set is created by averaging the test predictions of the $N$ fold models.
In [9]:
# G_train and G_test are pandas DataFrames holding the
# train x train and train x test blocks of the Gram matrix;
# krr is a kernel ridge regression
def oof_prediction(krr, G_train, y_train, G_test, n_folds, random_seed):
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
    n_train = G_train.shape[0]
    n_test = G_test.shape[1]
    oof_train = np.zeros(n_train)
    oof_test_folds = np.zeros((n_test, n_folds))
    for i, (train_index, test_index) in enumerate(kf.split(G_train)):
        # Restrict the Gram matrix to the in-fold rows and columns
        G_tr = G_train.loc[train_index, train_index].values
        y_tr = y_train[train_index].reshape((1, -1))
        G_te = G_train.loc[train_index, test_index].values
        krr.fit(G_tr, y_tr)
        # Out-of-fold prediction on the held-out bags
        oof_train[test_index] = krr.predict(G_te)
        # Prediction on the test bags from this fold's model
        G_test_partial = G_test.loc[train_index, :]
        oof_test_folds[:, i] = krr.predict(G_test_partial.values)
    # Average the test predictions of the n_folds models
    oof_test = oof_test_folds.mean(axis=1)
    return oof_train, oof_test
In [10]:
nb_bags_train = 980
# All instances in a bag share the same y, so taking the
# median per id recovers the unique label of each bag.
y_train = train.groupby("id")["y"].median().values
In [11]:
# Load a precomputed Gram matrix and split it into its
# train x train and train x test blocks
def load_gram(csv_file, nb_bags_train):
    G = pd.read_csv(csv_file, header=None)
    # .loc slicing is label-based and inclusive, hence the -1
    idx_train = nb_bags_train - 1
    idx_test = nb_bags_train
    G_train = G.loc[:idx_train, :idx_train]
    G_test = G.loc[:idx_train, idx_test:]
    return (G_train, G_test)
# Define models and import Gram matrices
# Cauchy
l2_reg_cauchy = 2**(-23)
cauchy = RidgeRegression(l2_reg_cauchy)
G_train_cauchy, G_test_cauchy = load_gram("kernels_me/Cauchy_16.csv", nb_bags_train)
# Matern 5/2
l2_reg_matern_52 = 2**(-31)
matern_52 = RidgeRegression(l2_reg_matern_52)
G_train_matern_52, G_test_matern_52 = load_gram("kernels_me/Matern_52_64.csv", nb_bags_train)
# Rational quadratic
l2_reg_rquadr = 2**(-26)
rquadr = RidgeRegression(l2_reg_rquadr)
G_train_rquadr, G_test_rquadr = load_gram("kernels_me/rquadr_512.csv", nb_bags_train)
In [12]:
# Create OOF train and test predictions
# Cauchy
cauchy_oof_train, cauchy_oof_test = oof_prediction(cauchy, G_train_cauchy,
y_train, G_test_cauchy,
n_folds, random_seed)
# Matern 5/2
matern_52_oof_train, matern_52_oof_test = oof_prediction(matern_52, G_train_matern_52,
y_train, G_test_matern_52,
n_folds, random_seed)
# Rational quadratic
rquadr_oof_train, rquadr_oof_test = oof_prediction(rquadr, G_train_rquadr,
y_train, G_test_rquadr,
n_folds, random_seed)
print("Training is finished.")
In the second stage, a Support Vector Regression (SVR) is fed the predictions of the three kernel ridge regressions to predict the target value $y$. The SVR uses these predictions to learn the weights assigned to each kernel regression, and we may hope it finds a better approximation of the true regression function $f$.
In [13]:
# Build the new data frames from the
# out-of-fold predictions
kernel_train = pd.DataFrame({'cauchy': cauchy_oof_train,
'matern_52': matern_52_oof_train,
'rquadr': rquadr_oof_train})
kernel_train["y"] = y_train
kernel_test = pd.DataFrame({'cauchy': cauchy_oof_test,
'matern_52': matern_52_oof_test,
'rquadr': rquadr_oof_test})
cols_excl_kernel = ["y"]
cols_kernel = [c for c in kernel_train.columns if c not in cols_excl_kernel]
kernel_train.head()
Out[13]:
In [18]:
# Root mean squared error metric
def RMSE(y, y_hat):
    out = np.sqrt(mean_squared_error(y.reshape((-1,)), y_hat.reshape((-1,))))
    return out
In [19]:
def scoring_function(parameters):
    print("Training the model with parameters: ")
    print(parameters)
    # Run several KFold shuffles and take the mean RMSE
    average_RMSE = []
    nb_run = 10
    for m in range(nb_run):
        KFold_RMSE = 0.0
        n_splits = 5
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=(random_seed + m))
        for train_index, validation_index in kf.split(kernel_train):
            train_fold = kernel_train.loc[train_index]
            validation_fold = kernel_train.loc[validation_index]
            svr = SVR(C=parameters["C"], epsilon=parameters["epsilon"])
            svr.fit(train_fold[cols_kernel], train_fold["y"])
            y_hat_validation = svr.predict(validation_fold[cols_kernel])
            KFold_RMSE += RMSE(validation_fold["y"].values, y_hat_validation)
        KFold_RMSE /= n_splits
        average_RMSE.append(KFold_RMSE)
    average_RMSE = np.array(average_RMSE)
    print("Cross-validation score: {0} +/- {1}\n".format(average_RMSE.mean(),
                                                         2 * average_RMSE.std()))
    return {"loss": average_RMSE.mean(), "status": STATUS_OK}
In [50]:
# Grid to pick parameters from.
parameters_grid = {"C": hp.choice("C", np.arange(0.5, 3, 0.5)),
"epsilon": hp.choice("epsilon", np.arange(0.05, 0.25, 0.05))
}
# Record the information about the cross-validation.
trials = Trials()
best = fmin(scoring_function, parameters_grid, algo=tpe.suggest, max_evals=10,
trials=trials)
In [51]:
min(trials.losses())
Out[51]:
In [52]:
# Save the best parameters as a csv.
best_parameters = pd.DataFrame({key: [value] for (key, value) in
                                space_eval(parameters_grid, best).items()})
# Add the corresponding score.
best_parameters["score"] = min(trials.losses())
best_parameters.to_csv("Parameters/best_parameters_SVR.csv", encoding="utf-8", index=False)
best_parameters
Out[52]:
In [57]:
best_parameters = pd.read_csv("Parameters/best_parameters_SVR.csv", encoding="utf-8")
best_parameters
Out[57]:
In [58]:
svr = SVR(C=best_parameters["C"][0],
epsilon=best_parameters["epsilon"][0])
svr.fit(kernel_train[cols_kernel], y_train)
Out[58]:
In [59]:
# Training error
RMSE(svr.predict(kernel_train[cols_kernel]), y_train)
Out[59]:
In [60]:
# Prediction
y_hat_test = svr.predict(kernel_test[cols_kernel])
# Use the groupby only as a scaffold to get one row per test bag id
# (the y column of test is empty and is overwritten just below)
test_pred = test.groupby("id")[["y"]].mean().reset_index()
test_pred["y"] = y_hat_test
test_pred.columns = ["Id", "y"]
# Save as a .csv
test_pred.to_csv("Predictions/Prediction_SVR.csv", index=False)
In [ ]: